Homepage
The formatted source code for this file is here.
And a raw version here.
Previous work by Youngser Park can be found here.

1 Data

Here we read in the data and select a random half of it for exploration.

## Read the full feature table (one row per putative synapse, one column
## per channel x feature).
featFull <- fread("../data/synapsinR_7thA.tif.Pivots.txt.2011Features.txt",showProgress=FALSE)

## Fix the RNG seed, draw a random half of the row indices for
## exploration; the complementary indices form the held-out half.
set.seed(2^10)
half1 <- sample(nrow(featFull), nrow(featFull)/2)
half2 <- setdiff(seq_len(nrow(featFull)), half1)

## Exploration half.
feat <- featFull[half1,]
dim(feat)
# [1] 559649    144

## Channel (marker) names, in the column-block order of the feature table.
channel <- c('Synap_1','Synap_2','VGlut1_t1','VGlut1_t2','VGlut2','Vglut3',
             'psd','glur2','nmdar1','nr2b','gad','VGAT',
             'PV','Gephyr','GABAR1','GABABR','CR1','5HT1A',
             'NOS','TH','VACht','Synapo','tubuli','DAPI')

## Synapse type per marker, in the same order as `channel`.
channel.type <- c('ex.pre','ex.pre','ex.pre','ex.pre','ex.pre','in.pre.small',
                  'ex.post','ex.post','ex.post','ex.post','in.pre','in.pre',
                  'in.pre','in.post','in.post','in.post','in.pre.small','other',
                  'ex.post','glur2','nmdar1','nr2b','gad','VGAT')
channel.type <- c('ex.pre','ex.pre','ex.pre','ex.pre','ex.pre','in.pre.small',
                  'ex.post','ex.post','ex.post','ex.post','in.pre','in.pre',
                  'in.pre','in.post','in.post','in.post','in.pre.small','other',
                  'ex.post','other','other','ex.post','none','none')

## Coarser typing: the rare types ('in.pre.small', 'none') are folded
## into 'other'.
channel.type2 <- c('ex.pre','ex.pre','ex.pre','ex.pre','ex.pre','other',
                   'ex.post','ex.post','ex.post','ex.post','in.pre','in.pre',
                   'in.pre','in.post','in.post','in.post','other','other',
                   'ex.post','other','other','ex.post','other','other')
nchannel <- length(channel)
## Number of features measured per channel (columns come in
## channel-major blocks of this size).
nfeat <- ncol(feat) / nchannel

## Creating factor variables for channel type, with levels in a fixed
## display order; `fchannel` holds the integer codes and `ford` is the
## permutation that sorts channels by type.
ffchannel <- factor(channel.type,
    levels = c("ex.pre","ex.post","in.pre","in.post","in.pre.small","other","none"))
fchannel <- as.numeric(ffchannel)
ford <- order(fchannel)


## One color per synapse-type level (same order as levels(ffchannel)),
## plus a coarser 3-color scheme; index by the per-channel type codes.
Syncol <-  c("#197300","#5ed155","#660000","#cc0000","#ff9933","#0000cd","#ffd700")
Syncol3 <- c("#197300","#197300","#cc0000","#cc0000","#0000cd","#0000cd","#0000cd")
ccol <- Syncol[fchannel]
ccol3 <- Syncol3[fchannel]

## Coarse ex/in/other labels for the channels ordered by synapse type
## (ford order): 11 excitatory, 6 inhibitory, 7 other.
exType <- factor(c(rep("ex",11), rep("in",6), rep("other",7)), ordered = TRUE)
exCol <- exType
levels(exCol) <- c("#197300","#990000","#0000cd")
exCol <- as.character(exCol)

## Per-column feature names (<channel>F0..F5, channel-major) and the
## matching per-column label colors.
fname <- paste0(rep(channel, each = 6), "F", 0:5)
names(feat) <- fname
fcol <- rep(ccol, each = 6)

## Palettes for heatmaps.
mycol <- colorpanel(100, "purple", "black", "green")
mycol2 <- matlab.like(nchannel)
mycol3 <- colorpanel(100, "blue","white","red")

1.1 Data transformations

## Column indices for each per-channel feature: columns are stored
## channel-major (Synap_1F0..F5, Synap_2F0..F5, ...), so feature k lives
## in columns k, k + nfeat, k + 2*nfeat, ...
## Generalized from the hard-coded 1:6 to seq_len(nfeat); identical for
## this data where nfeat == 6.
f <- lapply(seq_len(nfeat), function(k){seq(k, ncol(feat), by = nfeat)})
featF <- lapply(f, function(idx){subset(feat, select = idx)})

## F0 (integrated brightness) block, min-max scaled to [0, 1000].
## NOTE(review): a constant column would give 0/0 = NaN here -- assumed
## not to occur in this data; confirm if reusing on other inputs.
featF0 <- featF[[1]]
f01e3 <- 1e3*data.table(apply(X=featF0,2,function(x){((x-min(x))/(max(x)-min(x)))}))

fs <- f01e3

### Taking log_10 on data + 1, then z-scoring each column.
log1f <- log10(featF0 + 1)
slog1f <- data.table(scale(log1f, center=TRUE,scale=TRUE))

We now have the following data sets:

  • featF0: The feature vector looking only at the integrated brightness features.
  • fs: The feature vector scaled to the interval \([0,1000]\).
  • log1f: The feature vector, plus one, then \(log_{10}\) is applied.
  • slog1f: The feature vector, plus one, \(log_{10}\), then scaled by subtracting the mean and dividing by the sample standard deviation.

2 Marker Exploration

2.1 Correlation Matrix of markers

## Per-feature (F0..F5) correlation matrices across channels, with rows
## and columns reordered by synapse type so same-type blocks are adjacent.
corrf <- lapply(lapply(featF, cor), function(x) x[ford, ford])

titles <- paste('Correlation Matrix of', paste0("F", 0:5))
par(mfrow = c(3,2))
## seq_along() instead of 1:length() (safe for empty lists).
for(i in seq_along(corrf)) {
  corrplot(corrf[[i]],method="color",tl.col=ccol[ford],
           tl.cex=0.8, mar=c(1,0,1,1.5))
  title(titles[i])
}
Figure 1: Correlation on untransformed F0 data, reordered by synapse type.

## Column order for the full 144 x 144 correlation matrix: each channel
## contributes 6 consecutive columns, so repeat the type code 6x before
## ordering (groups columns by synapse type).
bford <- order(rep(fchannel,each=6))
## Column indices regrouped feature-major (all F0 columns, then F1, ...);
## NOTE(review): appears unused in this chunk.
nord <- Reduce('c', f)
## One label color per column (6 per channel); reindexed by bford below
## to match the reordered matrix.
cr <- rep(ccol, each=6)
corrfB <- cor(feat)[bford,bford]
corrplot(corrfB,method="color",tl.col=cr[bford],tl.cex=0.75)
Figure 2: Full Correlation on untransformed F0-F5 data, reordered by synapse type.

## Energy Matrix: Distance Correlation T-test
## Distance-correlation t-test (energy::dcor.ttest) between every pair of
## columns of `x`, computed on a fixed random subsample of at most 1000
## rows for tractability. Seeds are fixed so results are reproducible.
##
## Generalized from a hard-coded 24 columns to ncol(x); unchanged for
## this data where ncol(x) == 24. Also uses `$p.value` explicitly instead
## of relying on partial matching via `$p.val`.
##
## @param x numeric matrix / data.frame with one column per marker.
## @return list(ms, mp): symmetric matrices of test statistics and
##         p-values; diag(ms) = 0 and diag(mp) = 1 so the diagonal is
##         never flagged as significant.
computeDcor <- function(x) {
  nc <- ncol(x)
  set.seed(317)
  sam1 <- sample(nrow(x), min(1e3, nrow(x)))
  tmp <- as.data.frame(x[sam1, ])

  ## All unordered column pairs, one row per pair.
  combcols <- t(combn(nc, 2))

  ## Parallel over pairs (a foreach backend is assumed to be registered
  ## elsewhere); a per-iteration seed keeps each test reproducible.
  dc <- foreach(i = seq_len(nrow(combcols))) %dopar% {
         set.seed(331*i)
         dcor.ttest(x=tmp[,combcols[i,1]], y=tmp[,combcols[i,2]])
         }

  ms <- matrix(0, nc, nc)
  mp <- matrix(0, nc, nc)

  ## Fill both triangles so the result matrices are symmetric.
  for(i in seq_along(dc)){
      ms[combcols[i,1],combcols[i,2]] <- dc[[i]]$statistic
      ms[combcols[i,2],combcols[i,1]] <- dc[[i]]$statistic
      mp[combcols[i,1],combcols[i,2]] <- dc[[i]]$p.value
      mp[combcols[i,2],combcols[i,1]] <- dc[[i]]$p.value
  }

  dimnames(ms) <- list(colnames(x), colnames(x))
  dimnames(mp) <- list(colnames(x), colnames(x))

  diag(ms) <- 0
  diag(mp) <- 1
  list(ms, mp)
}

## dcor t-test statistic/p-value matrices for each feature F0..F5.
mdcor <- lapply(featF, computeDcor)
## Color ramps for the heatmaps below.
cl5 <- colorRampPalette(c("white", "blue"))
gr5 <- colorRampPalette(c("darkgreen", "white", "white"))
bl5 <- colorRampPalette(c("blue", "red"))

sTitle <- paste("dcor.ttest statistic", paste0("F", 0:5))
pTitle <- paste("dcor.ttest log10(p-value)", paste0("F", 0:5))


## Left column: dcor test statistics; right column: log10 p-values with
## non-significant cells (p > 0.01) marked by an 'x'. Matrices are
## reordered by synapse type (ford) to match the label colors.
par(mfcol=c(6,2), oma=c(1,1,1,1))
for(i in seq_along(mdcor)){
  corrplot(mdcor[[i]][[1]][ford,ford],is.corr=FALSE,method="color",
           tl.col=ccol[ford], tl.cex=0.8, mar=c(1,0,1,1.5))
  title(sTitle[i])
  ## Outline the blocks of equal synapse type.
  corrRect(as.numeric(table(fchannel)),col=Syncol,lwd=4)
}

for(i in seq_along(mdcor)){
  ## Machine epsilon guards log10(0) for vanishingly small p-values.
  corrplot((log10(mdcor[[i]][[2]][ford,ford]+.Machine$double.eps)),is.corr=FALSE,method="color",tl.col=ccol[ford],
           tl.cex=0.8, mar=c(1,0,1,1.5),
           p.mat=mdcor[[i]][[2]][ford,ford],
           sig.level=0.01, pch = 'x', pch.cex=0.5)
  title(pTitle[i])
  corrRect(as.numeric(table(fchannel)),col=Syncol,lwd=4)
}
Figure 3: F0-5: Distance correlation t-test statistic and p-value matrices.

The X’s in the above right figure denote a p-value greater than 0.01.

2.1.1 Shared significance

## Stack the six p-value matrices into a 24 x 24 x 6 array.
mp <- lapply(mdcor, '[[', 2)

MP <- abind(mp, along = 3)

## out[j, i] is TRUE when markers i and j test dependent (p <= 0.01) in
## EVERY feature F0..F5; symmetric by construction of MP.
out <-
  foreach(i = seq_along(channel), .combine='cbind') %:%
    foreach(j = seq_along(channel), .combine = 'rbind') %do% {
      all(MP[i,j,] <= 0.01)
}

rownames(out) <- colnames(out) <- channel
## FIX: reorder the matrix by synapse type so it lines up with the
## reordered label colors ccol[ford] and the corrRect type blocks (the
## original plotted `out` in unsorted channel order against them).
corrplot(out[ford, ford],is.corr=FALSE,method="color",
         tl.col=ccol[ford],  mar=c(0,0,1,0))
corrRect(as.numeric(table(fchannel)),col=Syncol,lwd=4)
title("Shared significances across F0-5")
Figure 4: Shared significances

A blue \(a_{ij}\) element in the above matrix plot implies that the null hypothesis that markers \(i\) and \(j\) are independent was rejected at significance level 0.01.

2.2 Scatterplots

We will run PCA on the untransformed correlation matrix so the data can be viewed in 2-dimensions. The colors correspond to synapse type.

## PCA of each type-reordered correlation matrix; rows (channels) are
## already in ford order, so color with ccol3[ford].
pcaL <- lapply(corrf, prcomp, center=TRUE, scale=TRUE)
titlepca <- paste("PCA on ", paste0("cor(F", 0:5, ')'))
for(i in seq_along(pcaL)) {
  pairs(pcaL[[i]]$x[,1:3], col=ccol3[ford],pch=20, cex=2,
        main=titlepca[i])
}
Figure 5: PCA of untransformed correlation matrix

Figure 5: PCA of untransformed correlation matrix

Figure 5: PCA of untransformed correlation matrix

Figure 5: PCA of untransformed correlation matrix

Figure 5: PCA of untransformed correlation matrix

Figure 5: PCA of untransformed correlation matrix

## PCA of the full 144-column correlation matrix; each point is one
## channel-feature column, colored by its channel's synapse type.
pcaB <- prcomp(corrfB,center=TRUE, scale=TRUE)
pairs(pcaB$x[,1:3], col=cr[bford],pch=20, cex=2)
Figure 6: PCA of untransformed correlation matrix

## PC1-PC2 scatter (per the caption) with column labels.
plot(pcaB$x[,1:3], col=cr[bford],pch=20, cex=2)
text(pcaB$x[,1:2], labels=rownames(pcaB$x), pos=4, col=cr[bford])
Figure 7: PC2 v PC1 of untransformed correlation matrix

## NOTE(review): `pcaB` is overwritten with its score matrix here, so
## the full prcomp object is unavailable past this point.
pcaB <- pcaB$x
## Interactive 3-D scatter of the first three PCs as an rgl widget.
## NOTE(review): rgl.texts() is the older low-level API (texts3d() is
## the current equivalent) -- left unchanged to preserve behavior.
rgl::plot3d(pcaB[,1],pcaB[,2],pcaB[,3],type='s',col=cr[bford], size=1)
rgl::rgl.texts(pcaB[,1],pcaB[,2],pcaB[,3],abbreviate(rownames(pcaB)), col=cr[bford], adj=c(0,2))
subid <- currentSubscene3d()
rglwidget(elementId="rgl-pcaB",width=720,height=720)

3 LDA

## Elbow (dimension) selection on each PCA scree, then keep only the
## leading PCs up to and including the first elbow.
el2 <- lapply(pcaL, function(p) getElbows(p$sdev, plot=FALSE))
del2 <- lapply(pcaL, function(p) {
                 elb <- getElbows(p$sdev, plot=FALSE)[1]
                 p$x[,1:elb]
        })

## Attach the coarse ex/in/other label (rows are in ford order, matching
## exType), then fit LDA and a random forest per feature set.
d1 <- lapply(del2, function(dat) data.frame(type = exType, dat))
lda.fit <- lapply(d1, function(dat) lda(type ~ ., data = dat))
rf.fit <- lapply(d1, function(dat) randomForest(type ~ ., data = dat))
## [[3]] extracts the third component of each randomForest object
## (presumably the OOB predictions -- TODO confirm).
rf.pred <- lapply(rf.fit, '[[', 3)

LDA was run using only the principal components up to and including the first elbow, selected by the Zhu–Ghodsi (ZG) criterion, from the PCA of the untransformed correlation matrix.

## Scatter of the first two retained PCs per feature set, overlaid with
## the Voronoi tessellation of the LDA class means.
titlesvor <- paste("LDA decision boundaries for", paste0("F", 0:5))
## [[3]] extracts the third component of each lda fit (the class means,
## cf. the commented-out `lda.fit$means` line below).
voronoidf <- lapply(lapply(lda.fit, '[[', 3), data.frame)
#voronoidf <- data.frame(x=lda.fit$means[,1],y=lda.fit$means[,2])

#This creates the voronoi line segments

par(mfrow = c(3,2))
for(i in seq_along(d1)){
  plot(d1[[i]][,2:3], col=ccol3[ford], pch=20, cex=1.5)
  title(titlesvor[i])
  ## Labels go to the left of points near the right margin so they stay
  ## inside the plotting region.
  text(d1[[i]][,2:3], labels=rownames(d1[[i]]),
       pos=ifelse(d1[[i]][,2]<max(d1[[i]][,2] -0.5),4,2),
       col=ccol3[ford], cex=1.2)

  deldir(x = voronoidf[[i]][,1],y = voronoidf[[i]][,2], rw = c(-15,15,-15,15),
       plotit=TRUE, add=TRUE, wl='te')
  text(voronoidf[[i]], labels=rownames(voronoidf[[i]]), cex=1.5, pos=1)
}
Figure 9: Voronoi diagrams on class means from LDA on PCA of untransformed correlation matrices

## Truth vs. LDA-predicted class labels (training-set confusion tables)
## for each feature set F0..F5.
truth <- lapply(d1, '[[', 1)
pred <- lapply(lapply(lda.fit, predict), '[[', 1)

for( i in seq_along(truth)) {
 print(table(truth = truth[[i]],pred = pred[[i]]) )
}
#        pred
# truth   ex in other
#   ex    10  0     1
#   in     1  5     0
#   other  1  0     6
#        pred
# truth   ex in other
#   ex    10  0     1
#   in     1  5     0
#   other  1  0     6
#        pred
# truth   ex in other
#   ex    10  0     1
#   in     1  5     0
#   other  0  2     5
#        pred
# truth   ex in other
#   ex     9  0     2
#   in     1  5     0
#   other  0  2     5
#        pred
# truth   ex in other
#   ex    10  0     1
#   in     1  5     0
#   other  1  0     6
#        pred
# truth   ex in other
#   ex    11  0     0
#   in     0  6     0
#   other  0  0     7
## Training misclassification rate per feature set; seq_along(truth)
## replaces the hard-coded 1:6 so this tracks the number of feature sets.
lda.err <- list()
for(i in seq_along(truth)) {
 ## Integer codes line up because pred's levels come from the same
 ## factor used to train the LDA.
 tr <- as.numeric(truth[[i]])
 pr <- as.numeric(pred[[i]])
 lda.err[[i]] <- sum( tr != pr ) / length(tr)
}

Reduce('c', lda.err)
# [1] 0.1250000 0.1250000 0.1666667 0.2083333 0.1250000 0.0000000

The above gives the LDA error rates for each of the features.